Multiplex assessment of protein variant abundance by massively parallel sequencing VAMP-seq - multiplex assay that uses fluorescent reporters to measure the steady-state abundance of protein variants in cultured human cells (each cell expresses a single variant directly fused to EGFP…the stability of the variant dictates the abundance of the EGFP fusion and, accordingly, the green fluorescence signal of the cell) - used to assess PTEN and TPMT variants

# Scatter plot of VAMP-seq scores relative to variant position in protein, colored by secondary struct
#essentially getting rid of the last row (wt)
pten1_proc_wt <- pten1_proc[!is.na(pten1_proc$position),]
#creating new column that specifices secondary structure (uses info from helix and sheet columns)
pten1_proc_wt$secondary_struct <- ifelse(is.na(pten1_proc_wt$helix), "unknown",
                        ifelse(pten1_proc_wt$helix==1, "helix",
                        ifelse(pten1_proc_wt$sheet==1, "sheet",
                        ifelse(pten1_proc_wt$helix==0, "neither",
                        "unknown"))))
pten_pos <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN scores in relation to protein structure") + geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)+theme_bw()
#colored by change in hydrophobicity instead of secondary struct
pten_hydro <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=(hydro2-hydro1)))+ geom_point(size=.3, alpha = 0.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Hydrophobicity")+ggtitle("PTEN scores in relation to change in hydrophobicity") + geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)
#repeat for tpmt
tpmt1_proc_wt <- tpmt1_proc[!is.na(tpmt1_proc$position),]
tpmt1_proc_wt$secondary_struct <- ifelse(is.na(tpmt1_proc_wt$helix), "unknown",
                        ifelse(tpmt1_proc_wt$helix==1, "helix",
                        ifelse(tpmt1_proc_wt$sheet==1, "sheet",
                        ifelse(tpmt1_proc_wt$helix==0, "neither",
                        "unknown"))))
tpmt_pos <- ggplot(tpmt1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)+theme_bw()
#creating a copy of tpmt1_proc_wt, addition of column that notes change from one secondary struct to another (for violin plot purposes) (plot tpmt_pos_vp to see)
tpmt_colors <- tpmt1_proc_wt
tpmt_colors$fact <- rep(10, nrow(tpmt_colors))
temp <- 1
for(i in 1:(length(tpmt_colors$fact)-1)) {
  if (tpmt_colors$secondary_struct[i] != tpmt_colors$secondary_struct[i+1]) {
    tpmt_colors$fact[i] <- temp
    temp <- temp + 1
  } else {
  tpmt_colors$fact[i] <- temp
  }
}
tpmt_colors$fact[length(tpmt_colors$fact)] <- temp
tpmt_pos_vp <- ggplot(tpmt_colors, aes(x=position, y=score))+ geom_violin(data=tpmt_colors[c(1:2783, 2798:4000),], aes(fill=as.character(fact), colour = factor(TRUE)), draw_quantiles = c(0.5), scale = "width") + 
scale_fill_manual(values=c("1" = "#A9A9A9", "2" = "#00C853", "3" = "#FF4848", "4" = "#00C853","5" = "#FF4848", "6" = "#00C853","7" = "#5757FF", "8" = "#00C853","9" = "#FF4848","10" = "#00C853","11" = "#5757FF", "12" = "#00C853","13" = "#FF4848", "14" = "#00C853", "15" = "#5757FF", "16" = "#00C853", "17" = "#5757FF", "18" = "#00C853", "19" = "#5757FF", "20" = "#00C853", "21" = "#5757FF", "22" = "#00C853", "23" = "#FF4848", "24" = "#5757FF", "25" = "#00C853", "26" = "#FF4848", "27" = "#00C853", "28" = "#5757FF", "29" = "#00C853", "30" = "#5757FF", "31" = "#00C853")) + scale_colour_manual(values = c("black")) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)
# Scatter plot with change of hydrophobicity along x-axis, abundance score along y
pten_hydro1 <- ggplot(pten1_proc_wt, aes(y=score, x=(hydro2-hydro1)))+ geom_point(size=0.5, alpha = 0.3) + ylab("Hydrophobicity")+xlab("VAMP-seq score")+ggtitle("PTEN scores in relation to change in hydrophobicity")
# Violin and scatter plots of 
pten_aa_spread <- ggplot(pten1_proc_wt, aes(y=score, x=end)) + geom_violin(draw_quantiles=c(0.5))
pten_aa_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=end)) + geom_point(size = 0.5, alpha = 0.3, position = position_jitter(width = 0.1, height = NULL))
plot(pten_pos)

plot(tpmt_pos)

#plot(tpmt_pos_vp)
#plot(pten_hydro)
#plot(pten_hydro1)
plot(pten_aa_spread)

plot(pten_aa_spread1)

# Looking for possible correlation between abundance and hydrogen bonding
# preprocessing data (probably unecessary, removes rows with NA in hbond_sum, adds column that once again denotes secondary structure)
pten1_hbond <- pten1_proc[!is.na(pten1_proc$hbond_sum),]
pten1_hbond$secondary_struct <- ifelse(is.na(pten1_hbond$helix), "unknown",
                        ifelse(pten1_hbond$helix==1, "helix",
                        ifelse(pten1_hbond$sheet==1, "sheet",
                        ifelse(pten1_hbond$helix==0, "neither",
                        "unknown"))))
# Scatter of abundance score vs hbond_sum (colored by secondary struct)
pten_plot_hbond <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score, colour=secondary_struct))+ geom_point(alpha=0.4) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + scale_color_manual(values=c("#FF4848", "#696969", "#5757FF")) + labs(colour="Secondary Structure") + theme_bw()
plot(pten_plot_hbond)

# (uncolored)
pten_plot_hbond1 <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score))+ geom_point(alpha = 0.2) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + theme_bw()
plot(pten_plot_hbond1)

#repeat for tpmt
tpmt1_hbond <- tpmt1_proc[!is.na(tpmt1_proc$hbond_sum),]
tpmt1_hbond$secondary_struct <- ifelse(is.na(tpmt1_hbond$helix), "unknown",
                        ifelse(tpmt1_hbond$helix==1, "helix",
                        ifelse(tpmt1_hbond$sheet==1, "sheet",
                        ifelse(tpmt1_hbond$helix==0, "neither",
                        "unknown"))))
tpmt_plot_hbond <- ggplot(tpmt1_hbond, aes(x=hbond_sum, y=score, colour=secondary_struct))+ geom_point(alpha=0.4) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + scale_color_manual(values=c("#FF4848", "#696969", "#5757FF")) + labs(colour="Secondary Structure") + theme_bw()
plot(tpmt_plot_hbond)

tpmt_plot_hbond1 <- ggplot(tpmt1_hbond, aes(x=hbond_sum, y=score))+ geom_point(alpha = 0.2) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("TPMT scores in relation to hydrogen bonding") + theme_bw()
plot(tpmt_plot_hbond1)

#less hydrogen bonds ~ higher abundance
#method1 for visualizing tracks (basic, for visualizing in rstudio)
grid.newpage()
grid.draw(rbind(ggplotGrob(tpmt_dssp_schematic), ggplotGrob(tpmt_pos_mean), ggplotGrob(tpmt_heat), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_dssp_schematic), ggplotGrob(pten_pos_mean), ggplotGrob(pten_heat), size = "last"))
#plotting mean score vs variant changed to 
tpmt_end_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="end")
tpmt_end_mean <- ggplot(tpmt_end_sum, aes(x=end, y=score)) +
  geom_bar(position=position_dodge(), stat="identity", colour="#999999") +
  geom_errorbar(aes(ymin=score-sd, ymax=score+sd), width=0.001, position=position_dodge()) +
  ylab("mean abundance") + xlab("variant amino acid") + theme_bw()
plot(tpmt_end_mean)

#plotting scores vs 'end' colored by location/secondary structure
tpmt_end_scores_b <- ggplot(data=subset(tpmt1_proc_wt, sheet==1), aes(x=end, y=score, colour=position)) +
  geom_violin(data=subset(tpmt1_proc_wt, sheet==1), draw_quantiles=0.5, scale = "width") +
  geom_point(data=subset(tpmt1_proc_wt, sheet==1), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
  scale_y_continuous(limits = c(-0.8, 2.2))
tpmt_end_scores_a <- ggplot(data=subset(tpmt1_proc_wt, helix==1), aes(x=end, y=score, colour=position)) +
  geom_violin(data=subset(tpmt1_proc_wt, helix==1), draw_quantiles=0.5, scale = "width") +
  geom_point(data=subset(tpmt1_proc_wt, helix==1), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
  scale_y_continuous(limits = c(-0.8, 2.2))
tpmt_end_scores_n <- ggplot(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), aes(x=end, y=score, colour=position)) +
  geom_violin(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), draw_quantiles=0.5, scale = "width") +
  geom_point(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
  scale_y_continuous(limits = c(-0.8, 2.2))
plot(tpmt_end_scores_b)

plot(tpmt_end_scores_a)

plot(tpmt_end_scores_n)

#tpmt_end_scores_60 <- ggplot(data=subset(tpmt1_proc_wt, position<=60), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position<=60), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position<=60), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_120 <- ggplot(data=subset(tpmt1_proc_wt, position>60 & position<=120), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>60 & position<=120), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>60 & position<=120), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_180 <- ggplot(data=subset(tpmt1_proc_wt, position>120 & position<=180), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>120 & position<=180), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>120 & position<=180), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_245 <- ggplot(data=subset(tpmt1_proc_wt, position>180 & position<=245), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>180 & position<=245), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>180 & position<=245), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#plot(tpmt_end_scores_60)
#plot(tpmt_end_scores_120)
#plot(tpmt_end_scores_180)
#plot(tpmt_end_scores_245)

# More of above
pten_a_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_a_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_r_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_r_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_n_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_n_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_d_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_d_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_n_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

pten_d_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

plot(pten_a_spread1)
plot(pten_a_aa)
plot(pten_r_spread1)
plot(pten_r_aa)
plot(pten_n_spread1)
plot(pten_n_hydrodiff)
plot(pten_n_aa)
plot(pten_d_spread1)
plot(pten_d_hydrodiff)
plot(pten_d_aa)
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Proline variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Proline variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Proline variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

#graphs for Threonine
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("T variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("T variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("T variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

# To plot positions with low stddev (excluding nonsense)

#creating new dataframe which includes mean abundance and variance, and discludes nonsense mutations (nonsense mutations cut proteins short and usually result in very low abundance scores, so I've decided to treat them separately from missense/synonymous mutations)
pten_variance <- summarySE((subset(pten1_data, end != "X")), measurevar="score", groupvars="position", na.rm=TRUE)

# Filtering for positions with enough variants and low variance in variant abundance scores
#adjust N (minimum # variants at a position) and stddev to liking
#5, 0.11
#10, 0.15
pten_variance_filtered <- subset(subset(pten_variance, N > 8 ), sd < 0.11)

#nonsense variant scores are graphed as dots, but are not included in the overlaying violin plots
ggplot(no_nonsense, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered$position), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(no_nonsense, aes(y=score, x=position, colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
# Repeat of above ,just testing different N and stddev parameter 
pten_variance_filtered1 <- subset(subset(pten_variance, N > 10), sd > 0.25)
ggplot(no_nonsense, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered1$position), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(no_nonsense, aes(y=score, x=position, colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered1$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

chosen <- c(61, 68, 105, 108, 123, 127, 130, 132, 135, 155, 165, 173, 174, 246, 323, 335)
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, pten1_proc_wt$position %in% chosen), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Clinvar path/likely path variant positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% chosen), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

# New track for heat-mean-variance-secondary-struct plot

#this is how pten_variance was defined: pten_variance <- summarySE((subset(pten1_data, end != "X")), measurevar="score", groupvars="position", na.rm=TRUE)
pten_variance_filtered4 <-subset(pten_variance, N>5)
variance_bar <- ggplot(pten_variance_filtered4, aes(y=sd, x=position)) +
  geom_segment(aes(x=position, xend=position, y=0, yend=sd), color="grey68") +
  geom_point(size=0.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")
plot(variance_bar)

gd=ggplot_gtable(ggplot_build(variance_bar))
maxWidth = grid::unit.pmax(ga$widths, gb$widths, gc$widths, gd$widths)
ga$widths <- as.list(maxWidth)
gb$widths <- as.list(maxWidth)
gc$widths <- as.list(maxWidth)
gd$widths <- as.list(maxWidth)
grid.newpage()

##storing, with specified widths!!
#pdf('pten_tpmt_mean_heat_variance.pdf', width=8, height=6)
#grid.arrange(arrangeGrob(gc,gd,ga,gb,nrow=4,heights=c(.1,.15,.15,.6)))
#dev.off()
# Multiplex paper's Fig2b (PTEN) shows trailing tails for the nonsense and synonymous variant scores
# this is to identify and investigate why variants in this tail are so far from their means
pten1_nonsense <- subset(pten1_proc, class == "nonsense")
tpmt1_nonsense <- subset(tpmt1_proc, class == "nonsense")
pten1_synon <- subset(pten1_proc, class == "synonymous")
tpmt1_synon <- subset(tpmt1_proc, class == "synonymous")
pten1_no_missense <- subset(pten1_proc, class == "synonymous" | class == "nonsense")
ggplot(pten1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") + theme_bw()

#+ geom_density()
ggplot(pten1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white") + theme_bw()

ggplot(pten1_proc_wt, aes(x=score)) + geom_histogram(data=subset(pten1_proc_wt,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "missense"), fill = "green", alpha = 0.2, binwidth=.01) + theme_bw()

ggplot(pten1_no_missense, aes(x=score)) + geom_histogram(data=subset(pten1_no_missense,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_no_missense,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + theme_bw()

# TPMT
#ggplot(tpmt1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") + theme_bw()
#ggplot(tpmt1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white") + theme_bw()
# we can see the paper used a different scale, which was ... deceiving
# not many variants are nonsense or synonymous, so the tail is not actually that large for both of them
# Taking a subset of the data, all variants above a certain score (for nonsense), and below a certain score (for synonymous)
#0.55
nonsense_tail <- subset(pten1_nonsense, score > 0.6)
synon_tail <- subset(pten1_synon, score < 0.6)
nonsense_tail$secondary_struct <- ifelse(is.na(nonsense_tail$helix), "unknown",
                        ifelse(nonsense_tail$helix==1, "helix",
                        ifelse(nonsense_tail$sheet==1, "sheet",
                        ifelse(nonsense_tail$helix==0, "neither",
                        "unknown"))))
synon_tail$secondary_struct <- ifelse(is.na(synon_tail$helix), "unknown",
                        ifelse(synon_tail$helix==1, "helix",
                        ifelse(synon_tail$sheet==1, "sheet",
                        ifelse(synon_tail$helix==0, "neither",
                        "unknown"))))
n_tail <- nonsense_tail[,c(1,2,7,30,127)]
s_tail <- synon_tail[,c(1,2,7,30,127)]
n_tail$bp_pos <- (n_tail$position-1)*3
s_tail$bp_pos <- (s_tail$position-1)*3
n_tail
s_tail
# Graphing the positions of these variants in the protein
#just in case there is a discernible pattern
s_tail_pos <- ggplot(s_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN synonymous variant tail scores in relation to protein structure") + theme_bw()
plot(s_tail_pos)

n_tail_pos <- ggplot(n_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN nonsense variant tail scores in relation to protein structure") + theme_bw()
plot(n_tail_pos)

TPMT_abun_CADD <- ggplot(tpmt_merge, aes(x=abundance_class, y=CADD_raw_rankscore)) + geom_violin(draw_quantiles = c( 0.5))+ylab("CADD raw rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_CADD)

TPMT_abun_SIFT_conv <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(SIFT_converted_rankscore))) + geom_violin(draw_quantiles = c(0.5))+ylab("SIFT conv rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_SIFT_conv)

TPMT_abun_POLY <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HDIV_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HDIV rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_POLY)

TPMT_abun_POLY1 <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HVAR_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HVAR rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_POLY1)

# Bar plots of what Polyphen and SIFT scores make up each abundance class!
Pred_abun_SIFT <- ggplot(tpmt_merge, aes(abundance_class)) + geom_bar(aes(fill = SIFT_pred)) + ggtitle("Abundance class vs SIFT prediction of Damaging or Tolerated") + theme_bw()
plot(Pred_abun_SIFT)

trial_sep <- tpmt_merge[c(21,23,24,26)]
tpmt_merge_expand <- separate_rows(tpmt_merge, c("Polyphen2_HDIV_score", "Polyphen2_HDIV_pred", "Polyphen2_HVAR_score", "Polyphen2_HVAR_pred"))
Pred_abun_HVAR <- ggplot(tpmt_merge_expand, aes(abundance_class)) + geom_bar(aes(fill = Polyphen2_HVAR_pred)) + ggtitle("Abundance class vs Polyphen2 HVAR predictions") + labs(subtitle = "D: Probably Damaging, P: Possibly Damaging, B: Benign") + theme_bw()
plot(Pred_abun_HVAR)

# Focusing on individual "end", or variant, amino acids! A lot more can be done... and a lot of what I have here is unfocused and too busy to be helpful
twenty_color1 = c("#D02028", "#A4C33B","#53958B", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black', "#EDD941", "#F2F08E", "#EEC898", "#E1A12F", "#76C158",  "#BCDDAE", "#85782E", "#315935", "#A1DAE0", "#486EB6")
pten_dssp_schematic1 <- ggplot() +
  geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab("Position in PTEN") + ylab("\n \n \n") +
  theme(panel.border = element_blank(), axis.text.y = element_blank())
aas <- c("A", "C", "P", "X")
aas1 <- c("S", "C", "P", "X")
aas2 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
pten_a_pos <- ggplot(data=subset(pten1_proc_wt, end=="A"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="A" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Alanine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
#plot(pten_a_pos)
pten_s_pos <- ggplot(data=subset(pten1_proc_wt, end=="S"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="S" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Serine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
pten_c_pos <- ggplot(data=subset(pten1_proc_wt, end=="C"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="C" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Cysteine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
pten_x_pos <- ggplot(data=subset(pten1_proc_wt, end=="X"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="X" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of nonsense variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_a_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_s_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_c_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_x_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

# More of above
#snake at the end (arches over grey region)
pten_p_pos <- ggplot(data=subset(pten1_proc_wt, end=="P"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="P" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Proline variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_p_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
# More of above
#matching snake at the end!
pten_g_pos <- ggplot(data=subset(pten1_proc_wt, end=="G"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="G" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Glycine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_g_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

pten_h_pos <- ggplot(data=subset(pten1_proc_wt, end=="H"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="H" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Histidine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_h_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

# notice 2nd from last grey region
pten_w_pos <- ggplot(data=subset(pten1_proc_wt, end=="W"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="W" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Typtophan variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_w_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

---
title: "PTEN R Notebook"
output: html_notebook
---
```{r global_options, include=FALSE}
library(knitr)
library(ggplot2)
require(gridExtra)
library(reshape2)
library(pracma)
library(ggbeeswarm)
library(Rmisc)
library(grid)
library(EBImage)
library(googlesheets)
library(tidyr)
library(dplyr)
knitr::opts_chunk$set(fig.width=12, fig.height=12, warning=FALSE)
```
Multiplex assessment of protein variant abundance by massively parallel sequencing
VAMP-seq
- multiplex assay that uses fluorescent reporters to measure the steady-state abundance of protein variants in cultured human cells (each cell expresses a single variant directly fused to EGFP...the stability of the variant dictates the abundance of the EGFP fusion and, accordingly, the green fluorescence signal of the cell)
- used to assess PTEN and TPMT variants 
```{r echo=FALSE}
par(pch=20, cex=.6)
#local
#source = https://www.nature.com/articles/s41588-018-0122-z, Supplementary Dataset 1, 2
pten1_data <- read.delim('~/leklab/leklab/pten1.txt')
tpmt1_data <- read.delim('~/leklab/leklab/tpmt_suppl_2.txt')

#processed verson where rows with abundance class = NA are deleted
pten1_proc <- pten1_data[!is.na(pten1_data$abundance_class),]
tpmt1_proc <- tpmt1_data[!is.na(tpmt1_data$abundance_class),]
```
```{r echo=FALSE}
# Boxplot of abundane scores vs abundance class (low, possibly low, etc) for PTEN and TPMT
dd <- data.frame(pten1_proc$abundance_class,pten1_proc$score)
colnames(dd) <- c("abundance_class", "score")
ee <- data.frame(tpmt1_proc$abundance_class,tpmt1_proc$score)
colnames(ee) <- c("abundance_class", "score")
dd$protein <- rep("PTEN", nrow(dd))
ee$protein <- rep("TPMT", nrow(ee))
ff = data.frame(rbind(dd, ee))
bbpp = boxplot(score~protein+abundance_class, data = ff, at = c(1, 1.8, 3, 3.8, 5, 5.8, 7.2, 8), xaxt='n', col = c('white', 'gray'))
axis(side=1, at=c(1.4, 3.4, 5.4, 7.6), labels=c('low', 'possibly low', 'possibly\n wt-like', 'wt-like'))
title('VAMP-seq scores of PTEN and TPMT Variants\nand abundance class')
```

```{r echo=FALSE}
# Violin Plot version of above
VAMP_abundance <- ggplot(ff, aes(x=abundance_class, y=score, fill=protein)) + geom_violin(draw_quantiles = 0.5)+ylab("VAMP-seq score")+xlab("Abundance Class")+theme(legend.title=element_blank(), panel.grid.major = element_line(colour = "grey"), panel.grid.minor = element_line(colour = "grey"))+ggtitle("VAMP-seq scores for each abundance classification")+geom_point(data=data.frame(x="wt-like", y=1, protein = "PTEN"), aes(x,y), colour="black", size=1.5, show.legend=FALSE)+annotate("text", x = "wt-like", y=1.09, label = "WT",colour= "black", size = 4) + scale_y_continuous(minor_breaks = seq(-2, 2, .25))+theme_bw()
plot(VAMP_abundance)
```

```{r echo=FALSE}
#combining pten1_data and tpmt1_data into one large data frame, differentiate between the two w/ column 'protein' which specifies 'PTEN' or 'TPMT'
pten1_data$protein <- rep("PTEN", nrow(pten1_data))
tpmt1_data$protein <- rep("TPMT", nrow(tpmt1_data))
common_cols <- intersect(colnames(pten1_data), colnames(tpmt1_data))
comb_data = rbind(subset(pten1_data, select = common_cols), subset(tpmt1_data, select = common_cols))

#plots helix vs score for PTEN and TPMT side by side
#getting rid of rows with NA in helix column
comb_data_helix <- comb_data[!is.na(comb_data$helix),]
comb_data_sheet <- comb_data[!is.na(comb_data$sheet),]
#unecessary step (gets rid of row removal warning when plotting)
ck <- comb_data_helix[!is.na(comb_data_helix$abundance_class),]
ck1 <- comb_data_sheet[!is.na(comb_data_sheet$abundance_class),]

#Violin plots of PTEN and TPMT variant abundance scores grouped by secondary structure
h_plot <- ggplot(ck, aes(x=as.factor(helix), y=score, fill=protein)) + geom_violin(data=subset(ck, helix==1), draw_quantiles = c(0.5)) + guides(fill=FALSE) + xlab("Alpha Helix") + ylab("VAMP-seq score") + theme(axis.text.x = element_blank()) + scale_y_continuous(limits = c(-.7, 2.03))

s_plot <- ggplot(ck1, aes(x=as.factor(sheet), y=score, fill=protein)) + geom_violin(data=subset(ck1, sheet==1), draw_quantiles = c(0.5)) +  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.text.x = element_blank()) + xlab("Beta Sheet") + scale_y_continuous(limits = c(-.7, 2.03)) + guides(fill=FALSE) 

n_plot <- ggplot(ck, aes(x=as.factor(helix), y=score, fill=protein)) + geom_violin(data=subset(ck, helix==0 & sheet==0), draw_quantiles = c( 0.5)) + theme( axis.title.y = element_blank(), axis.text.y = element_blank(), axis.text.x = element_blank(), legend.justification=c(1,0), legend.position=c(.49,.75), legend.title=element_blank(), legend.text = element_text(size=10)) + xlab("Other") + scale_y_continuous(limits = c(-.7, 2.03))

#put the plots side by side
combined <- grid.arrange(h_plot, s_plot, n_plot, ncol=3, top = "Variant scores in relation to position in protein")
##############
##save as pdf

# pdf("violin_Variant_scores_vs.pdf")
# plot(combined)
# plot(VAMP_abundance)
# dev.off()
##############
#works to save single plot
#ggsave("Variant_scores_protein_position.pdf", plot = combined, device = "pdf", path = "/Users/go2alyssa/Desktop/", scale = 2.6, dpi = "retina")

```

```{r}
# Scatter plot of VAMP-seq scores relative to variant position in protein, colored by secondary struct

#essentially getting rid of the last row (wt)
pten1_proc_wt <- pten1_proc[!is.na(pten1_proc$position),]

#creating new column that specifices secondary structure (uses info from helix and sheet columns)
pten1_proc_wt$secondary_struct <- ifelse(is.na(pten1_proc_wt$helix), "unknown",
                        ifelse(pten1_proc_wt$helix==1, "helix",
                        ifelse(pten1_proc_wt$sheet==1, "sheet",
                        ifelse(pten1_proc_wt$helix==0, "neither",
                        "unknown"))))

pten_pos <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN scores in relation to protein structure")
#+ geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)+theme_bw()

#all the geom_vlines denoted exon boundaries, however, based on how the variant proteins were produced by the lab (no splicing involed), I decided to remove these markers

#colored by change in hydrophobicity instead of secondary struct
pten_hydro <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=(hydro2-hydro1)))+ geom_point(size=.3, alpha = 0.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Hydrophobicity")+ggtitle("PTEN scores in relation to change in hydrophobicity") 

#repeat for tpmt
tpmt1_proc_wt <- tpmt1_proc[!is.na(tpmt1_proc$position),]
tpmt1_proc_wt$secondary_struct <- ifelse(is.na(tpmt1_proc_wt$helix), "unknown",
                        ifelse(tpmt1_proc_wt$helix==1, "helix",
                        ifelse(tpmt1_proc_wt$sheet==1, "sheet",
                        ifelse(tpmt1_proc_wt$helix==0, "neither",
                        "unknown"))))
tpmt_pos <- ggplot(tpmt1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") 
#+ geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)

#creating a copy of tpmt1_proc_wt, addition of column that notes change from one secondary struct to another (for violin plot purposes) (plot tpmt_pos_vp to see)
tpmt_colors <- tpmt1_proc_wt
tpmt_colors$fact <- rep(10, nrow(tpmt_colors))
temp <- 1
for(i in 1:(length(tpmt_colors$fact)-1)) {
  if (tpmt_colors$secondary_struct[i] != tpmt_colors$secondary_struct[i+1]) {
    tpmt_colors$fact[i] <- temp
    temp <- temp + 1
  } else {
  tpmt_colors$fact[i] <- temp
  }
}
tpmt_colors$fact[length(tpmt_colors$fact)] <- temp


tpmt_pos_vp <- ggplot(tpmt_colors, aes(x=position, y=score))+ geom_violin(data=tpmt_colors[c(1:2783, 2798:4000),], aes(fill=as.character(fact), colour = factor(TRUE)), draw_quantiles = c(0.5), scale = "width") + 
scale_fill_manual(values=c("1" = "#A9A9A9", "2" = "#00C853", "3" = "#FF4848", "4" = "#00C853","5" = "#FF4848", "6" = "#00C853","7" = "#5757FF", "8" = "#00C853","9" = "#FF4848","10" = "#00C853","11" = "#5757FF", "12" = "#00C853","13" = "#FF4848", "14" = "#00C853", "15" = "#5757FF", "16" = "#00C853", "17" = "#5757FF", "18" = "#00C853", "19" = "#5757FF", "20" = "#00C853", "21" = "#5757FF", "22" = "#00C853", "23" = "#FF4848", "24" = "#5757FF", "25" = "#00C853", "26" = "#FF4848", "27" = "#00C853", "28" = "#5757FF", "29" = "#00C853", "30" = "#5757FF", "31" = "#00C853")) + scale_colour_manual(values = c("black")) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure")

# Scatter plot with change of hydrophobicity along x-axis, abundance score along y
pten_hydro1 <- ggplot(pten1_proc_wt, aes(y=score, x=(hydro2-hydro1)))+ geom_point(size=0.5, alpha = 0.3) + ylab("Hydrophobicity")+xlab("VAMP-seq score")+ggtitle("PTEN scores in relation to change in hydrophobicity")

# Violin and scatter plots of variant amino acid score spread (clustered by variant)
pten_aa_spread <- ggplot(pten1_proc_wt, aes(y=score, x=end)) + geom_violin(draw_quantiles=c(0.5))
pten_aa_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=end)) + geom_point(size = 0.5, alpha = 0.3, position = position_jitter(width = 0.1, height = NULL))

plot(pten_pos)
plot(tpmt_pos)

#plot(tpmt_pos_vp)
#plot(pten_hydro)
#plot(pten_hydro1)
plot(pten_aa_spread)
plot(pten_aa_spread1)
```
```{r}
# Looking for possible correlation between abundance and hydrogen bonding
# preprocessing data (probably unecessary, removes rows with NA in hbond_sum, adds column that once again denotes secondary structure)
pten1_hbond <- pten1_proc[!is.na(pten1_proc$hbond_sum),]
pten1_hbond$secondary_struct <- ifelse(is.na(pten1_hbond$helix), "unknown",
                        ifelse(pten1_hbond$helix==1, "helix",
                        ifelse(pten1_hbond$sheet==1, "sheet",
                        ifelse(pten1_hbond$helix==0, "neither",
                        "unknown"))))

# Scatter of abundance score vs hbond_sum (colored by secondary struct)
pten_plot_hbond <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score, colour=secondary_struct))+ geom_point(alpha=0.4) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + scale_color_manual(values=c("#FF4848", "#696969", "#5757FF")) + labs(colour="Secondary Structure") + theme_bw()

plot(pten_plot_hbond)

# (uncolored)
pten_plot_hbond1 <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score))+ geom_point(alpha = 0.2) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + theme_bw()

plot(pten_plot_hbond1)

#repeat for tpmt
tpmt1_hbond <- tpmt1_proc[!is.na(tpmt1_proc$hbond_sum),]
tpmt1_hbond$secondary_struct <- ifelse(is.na(tpmt1_hbond$helix), "unknown",
                        ifelse(tpmt1_hbond$helix==1, "helix",
                        ifelse(tpmt1_hbond$sheet==1, "sheet",
                        ifelse(tpmt1_hbond$helix==0, "neither",
                        "unknown"))))

tpmt_plot_hbond <- ggplot(tpmt1_hbond, aes(x=hbond_sum, y=score, colour=secondary_struct))+ geom_point(alpha=0.4) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + scale_color_manual(values=c("#FF4848", "#696969", "#5757FF")) + labs(colour="Secondary Structure") + theme_bw()

plot(tpmt_plot_hbond)

tpmt_plot_hbond1 <- ggplot(tpmt1_hbond, aes(x=hbond_sum, y=score))+ geom_point(alpha = 0.2) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("TPMT scores in relation to hydrogen bonding") + theme_bw()

plot(tpmt_plot_hbond1)

#less hydrogen bonds ~ higher abundance
```
```{r include=FALSE}
# To create multitrack (heat map, mean-variance, secondary struct)
#to calculate mean and variance for each position's variants' abundance scores 
tpmt_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="position")

# Mean-variance plot
tpmt_pos_mean <- ggplot(tpmt_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))

# Heat plot
tpmt_heat <- ggplot(tpmt1_proc_wt, aes(position, end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.7, 0.2, 1, 1.3, 2.03)))+ scale_x_continuous(breaks = seq(0, 245, 10), expand=c(0,0)) + theme(legend.position='bottom')+xlab("Position in TPMT") + scale_y_discrete(expand = c(0,0))
#scale_fill_gradient2(low="#3F7CB9", mid="white", high="#B21F4E", midpoint=1) 
#scale_fill_distiller(palette= "RdBu")

# Secondary struct (altered from FowlerLab/VAMPseq github)
tpmt_dssp_schematic <- ggplot() + ggtitle("TPMT mean abundance scores") +
  geom_segment(aes(x = 1, y = 0, xend = max(tpmt1_data$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = tpmt1_data, aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(tpmt1_data, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(tpmt1_data, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
  theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())

#plots
plot(tpmt_pos_mean)
plot(tpmt_heat)
tpmt_dssp_schematic


#grouping all variants in the same secondary structure together
#tpmt_aa_sum <- summarySE(tpmt_colors, measurevar="score", groupvars="fact")
#tpmt_aa_mean <- ggplot(tpmt_aa_sum, aes(x=fact, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 245, 10)) + xlab("each bar is a different secondary structure")

#plot(tpmt_aa_mean)
```
```{r include=FALSE}
# Repeat for PTEN

pten_sum <- summarySE(pten1_proc_wt, measurevar="score", groupvars="position")

pten_pos_mean <- ggplot(pten_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)
pten_heat <- ggplot(pten1_proc_wt, aes(position, end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.23, 0.42, 1, 1.2, 1.47)))+ scale_x_continuous(breaks = seq(0, 403, 20), expand=c(0,0)) + theme(legend.position='bottom')+xlab("Position in TPMT") + scale_y_discrete(expand = c(0,0))
#c(-0.7, 0.2, 1, 1.3, 2.03)
#c(-0.23, 0.42, 1, 1.2, 1.47)

#local
#source = https://github.com/FowlerLab/VAMPseq, PTEN_positional_data.tsv
#could have done it the same way as tpmt, but resulting graph has a little quirk 
pten_extra <- read.table(file = '~/leklab/leklab/PTEN_positional_data.tsv', sep = '\t', header = TRUE)
pten_dssp_schematic <- ggplot() + ggtitle("PTEN mean abundance scores") +
  geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
  theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())

#plots
plot(pten_pos_mean)
plot(pten_heat)
pten_dssp_schematic

```


```{r}
#method1 for visualizing tracks (basic, for visualizing in rstudio)
grid.newpage()
grid.draw(rbind(ggplotGrob(tpmt_dssp_schematic), ggplotGrob(tpmt_pos_mean), ggplotGrob(tpmt_heat), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_dssp_schematic), ggplotGrob(pten_pos_mean), ggplotGrob(pten_heat), size = "last"))
```
```{r include=FALSE}
#method2 (use for final layout, size specification, download)
gA=ggplot_gtable(ggplot_build(tpmt_pos_mean))
gB=ggplot_gtable(ggplot_build(tpmt_heat))
gC=ggplot_gtable(ggplot_build(tpmt_dssp_schematic))

ga=ggplot_gtable(ggplot_build(pten_pos_mean))
gb=ggplot_gtable(ggplot_build(pten_heat))
gc=ggplot_gtable(ggplot_build(pten_dssp_schematic))

maxWidth = grid::unit.pmax(gA$widths, gB$widths, gC$widths, ga$widths, gb$widths, gc$widths)
gA$widths <- as.list(maxWidth)
gB$widths <- as.list(maxWidth)
gC$widths <- as.list(maxWidth)
ga$widths <- as.list(maxWidth)
gb$widths <- as.list(maxWidth)
gc$widths <- as.list(maxWidth)

grid.newpage()

#to download as pdf, with specified widths!!
#pdf('pten_tpmt_mean_heat.pdf', width=8, height=6)
#grid.arrange(arrangeGrob(gC,gA,gB,nrow=3,heights=c(.1,.3,.8)))
#grid.arrange(arrangeGrob(gc,ga,gb,nrow=3,heights=c(.1,.3,.8)))
#dev.off()
```

```{r}
#plotting mean score for each variant amino acid (grouped by amino acid name)
tpmt_end_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="end")
tpmt_end_mean <- ggplot(tpmt_end_sum, aes(x=end, y=score)) +
  geom_bar(position=position_dodge(), stat="identity", colour="#999999") +
  geom_errorbar(aes(ymin=score-sd, ymax=score+sd), width=0.001, position=position_dodge()) +
  ylab("mean abundance") + xlab("variant amino acid") + theme_bw()
plot(tpmt_end_mean)

#as expected, nonsense (X) has lowest mean abundance, as well as proline
```
```{r}
#does not show much

# # Violin overlaid scatter plotting all abundance scores vs 'end' aa colored by location/secondary structure
# tpmt_end_scores_b <- ggplot(data=subset(tpmt1_proc_wt, sheet==1), aes(x=end, y=score, colour=position)) +
#   geom_violin(data=subset(tpmt1_proc_wt, sheet==1), draw_quantiles=0.5, scale = "width") +
#   geom_point(data=subset(tpmt1_proc_wt, sheet==1), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#   scale_y_continuous(limits = c(-0.8, 2.2))
# 
# tpmt_end_scores_a <- ggplot(data=subset(tpmt1_proc_wt, helix==1), aes(x=end, y=score, colour=position)) +
#   geom_violin(data=subset(tpmt1_proc_wt, helix==1), draw_quantiles=0.5, scale = "width") +
#   geom_point(data=subset(tpmt1_proc_wt, helix==1), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#   scale_y_continuous(limits = c(-0.8, 2.2))
# 
# tpmt_end_scores_n <- ggplot(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), aes(x=end, y=score, colour=position)) +
#   geom_violin(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), draw_quantiles=0.5, scale = "width") +
#   geom_point(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#   scale_y_continuous(limits = c(-0.8, 2.2))
# 
# plot(tpmt_end_scores_b)
# plot(tpmt_end_scores_a)
# plot(tpmt_end_scores_n)

#tpmt_end_scores_60 <- ggplot(data=subset(tpmt1_proc_wt, position<=60), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position<=60), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position<=60), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_120 <- ggplot(data=subset(tpmt1_proc_wt, position>60 & position<=120), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>60 & position<=120), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>60 & position<=120), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_180 <- ggplot(data=subset(tpmt1_proc_wt, position>120 & position<=180), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>120 & position<=180), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>120 & position<=180), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_245 <- ggplot(data=subset(tpmt1_proc_wt, position>180 & position<=245), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>180 & position<=245), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>180 & position<=245), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#plot(tpmt_end_scores_60)
#plot(tpmt_end_scores_120)
#plot(tpmt_end_scores_180)
#plot(tpmt_end_scores_245)
```


```{r echo=FALSE}
# Focusing on individual "start", or reference, amino acids!
set.seed(153)
jitter <- position_jitter(width = 1, height = NULL)
jitter1 <-position_jitter(width = 0.08, height = NULL)
jitter2 <- position_jitter(width=0.13, height = NULL)
twenty_color = c("#D02028", "#E1A12F", "#EDD941", "#F2F08E", "#EEC898", "#BCDDAE", "#A4C33B", "#76C158", "#85782E", "#315935", "#53958B", "#A1DAE0", "#486EB6", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black')

#pten_k_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=factor(position))) + xlab("Position in PTEN") + ggtitle("Lysine variant abundance scores")

# Violin overlaid scatter of each reference lysine's variant scores vs position in protein
pten_k_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Lysine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "K"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

# Violin overlaid scatter of each reference lysine's variant scores grouped by amino acid
pten_k_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Lysine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "K"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

# Repeat for glycine and cysteine and ...
pten_g_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Glycine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_g_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Glycine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

# Same as pten_[aa]_spread1, just colored by difference in hydrophobicity instead of variant aa
pten_g_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Glycine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

# Just basic scatter plot version of pten_c_spread1
pten_c_spread <- ggplot(pten1_proc_wt, aes(y=score, x=position)) + geom_point(data=subset(pten1_proc_wt, start== "C")) + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores") + theme_bw()

pten_c_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_c_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_c_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter1) + scale_color_distiller(palette = "Spectral") + theme_bw()


#pten_s_spread <- ggplot(pten1_proc_wt, aes(y=score, x=position)) + geom_point(data=subset(pten1_proc_wt, start== "S")) + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")
#pten_s_spread1_old <- ggplot(pten1_proc_wt, aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position))) + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")

pten_s_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_s_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_s_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

# Graphing abundance score (y-axis) vs change in hydrophobicity (x-axis)
pten_s_hh_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(hydro2-hydro1), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(hydro2-hydro1)), scale = "width") + xlab("Change in hydrophobicity") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(hydro2-hydro1), y=score), alpha = 0.75, position=jitter1) + theme_bw()

## Graphed data is exactly the same as pten_s_aa, but colors by change in hydrophobicity rather than variant
#pten_s_aa_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)

# Different variations
#pten_s_aa_voldiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = voldiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_poldiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = polaritydiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_weightdiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = weightdiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)

#specific amino acid tests
##plot(pten_k_spread)
##plot(pten_g_spread)
##plot(pten_c_spread)

##plot(pten_s_spread)
##plot(pten_s_spread1_old)

#plot(pten_s_hh_hydrodiff) #probably not very useful... does not take into account position anymore
#plot(pten_s_aa_hydrodiff) #not as useful as pten_s_aa
##plot(pten_s_aa_voldiff)
##plot(pten_s_aa_poldiff)
##plot(pten_s_aa_weightdiff)

plot(pten_c_spread1)
plot(pten_c_aa)
plot(pten_c_hydrodiff)
plot(pten_s_spread1)
plot(pten_s_aa)
plot(pten_s_hydrodiff)
plot(pten_g_spread1)
plot(pten_g_aa)
plot(pten_g_hydrodiff)
plot(pten_k_spread1)
plot(pten_k_aa)

```

```{r}
# More of above
pten_a_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_a_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_r_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_r_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_n_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_n_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_d_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()

pten_d_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

pten_n_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

pten_d_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

plot(pten_a_spread1)
plot(pten_a_aa)
plot(pten_r_spread1)
plot(pten_r_aa)
plot(pten_n_spread1)
plot(pten_n_hydrodiff)
plot(pten_n_aa)
plot(pten_d_spread1)
plot(pten_d_hydrodiff)
plot(pten_d_aa)

```
```{r}
# Even more from above... plots not named

#graphs for proline (equiv to pten_p_spread1, pten_p_aa, pten_p_hydrodiff)
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Proline variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Proline variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "P"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Proline variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "P"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()
```
```{r}
#graphs for Threonine
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Threonine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color) + theme_bw()
ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Threonine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "T"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Threonine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "T"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral") + theme_bw()

```


```{r}
# To plot positions with low stddev (excluding nonsense)

#creating new dataframe which includes mean abundance and variance, and discludes nonsense mutations (nonsense mutations cut proteins short and usually result in very low abundance scores, so I've decided to treat them separately from missense/synonymous mutations)
pten_variance <- summarySE((subset(pten1_data, end != "X")), measurevar="score", groupvars="position", na.rm=TRUE)

# Filtering for positions with enough variants and low variance in variant abundance scores
#adjust N (minimum # variants at a position) and stddev to liking
#5, 0.11
#10, 0.15
pten_variance_filtered <- subset(subset(pten_variance, N > 8 ), sd < 0.11)

#nonsense variant scores are graphed as dots, but are not included in the overlaying violin plots
ggplot(no_nonsense, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered$position), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(no_nonsense, aes(y=score, x=position, colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

```
```{r}
# Repeat of above ,just testing different N and stddev parameter 
pten_variance_filtered1 <- subset(subset(pten_variance, N > 10), sd > 0.25)

ggplot(no_nonsense, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered1$position), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()

ggplot(no_nonsense, aes(y=score, x=position, colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered1$position), aes(group=position%%450), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% pten_variance_filtered1$position), aes(x=position, y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()


# Checking to see variant scores of positions where clinvar has documented existence of a path/likely path variant in the position
#chosen includes positions of clinvar variants
chosen <- c(61, 68, 105, 108, 123, 127, 130, 132, 135, 155, 165, 173, 174, 246, 323, 335)

ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + 
  geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, pten1_proc_wt$position %in% chosen), aes(group=factor(position)), scale = "width") + 
  xlab("Position in PTEN") + ggtitle("Clinvar path/likely path variant positions") +
  geom_point(data=subset(pten1_proc_wt, pten1_proc_wt$position %in% chosen), aes(x=factor(position), y=score), alpha = 0.85, position=jitter1) + scale_color_manual(values=twenty_color) + theme_bw()
#expected to see some positions with variants all with low abundance, which was somewhat the case? 68, 105, 155, 256 may show this
#results that differ may reflect that actual variant aa qualities, and not just position, affect abundance score

```
```{r include=FALSE}
# Plotting which positions in the protein have low variance in abundances scores, was potentially going to use as an additional in the heat-mean-variance-secondary-struct plot, but turned out ... quite poorly

ggplot(no_nonsense, aes(y=score, x=position)) + 
  geom_bar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered1$position), position=position_dodge(), stat="identity", colour="#999999") +
  geom_errorbar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered1$position), aes(ymin=score-sd, ymax=score+sd), width=1, size=0.3, position=position_dodge()) +
  xlab("Position in PTEN") + ggtitle("Intolerant and tolerant amino acid positions") +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")

pten_variance_filtered2 <- subset(subset(pten_variance, N > 10), sd > 0.35)

ggplot(no_nonsense, aes(y=sd, x=position)) +
  geom_bar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered$position), position=position_dodge(), stat="identity", colour="#999999")

#all sd over .25, N>10
ggplot(pten_sum, aes(y=sd, x=position)) +
  geom_point(data=subset(pten_sum, pten_sum$position %in% pten_variance_filtered1$position)) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")


#all sd
ggplot(pten_sum, aes(y=sd, x=position)) +
  geom_segment(aes(x=position, xend=position, y=0, yend=sd), color="grey48") +
  #geom_point() +
  #geom_errorbar(data=subset(no_nonsense, no_nonsense$position %in% pten_variance_filtered2$position), aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) + 
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")

pten_sum_filt <- pten_sum
pten_sum_filt$sd2 = pten_variance_filtered4[match(pten_sum_filt$position, pten_variance_filtered4$position), "sd"]
pten_pos_colored_mean <- ggplot(pten_sum_filt, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour=pten_sum_filt$sd2) + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)+ scale_colour_gradient(low = "white", high = "blue")
plot(pten_pos_colored_mean)
# Focusing on individual "end", or variant, amino acids! A lot more can be done with this, and a lot needs to be improved
```



```{r}
# New track for heat-mean-variance-secondary-struct plot

#this is how pten_variance was defined: pten_variance <- summarySE((subset(pten1_data, end != "X")), measurevar="score", groupvars="position", na.rm=TRUE)
pten_variance_filtered4 <-subset(pten_variance, N>5)
variance_bar <- ggplot(pten_variance_filtered4, aes(y=sd, x=position)) +
  geom_segment(aes(x=position, xend=position, y=0, yend=sd), color="grey68") +
  geom_point(size=0.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))+theme(axis.title.x = element_blank(), axis.text.x = element_blank(), legend.position="none")
plot(variance_bar)

gd=ggplot_gtable(ggplot_build(variance_bar))
maxWidth = grid::unit.pmax(ga$widths, gb$widths, gc$widths, gd$widths)
ga$widths <- as.list(maxWidth)
gb$widths <- as.list(maxWidth)
gc$widths <- as.list(maxWidth)
gd$widths <- as.list(maxWidth)
grid.newpage()

##storing, with specified widths!!
#pdf('pten_tpmt_mean_heat_variance.pdf', width=8, height=6)
#grid.arrange(arrangeGrob(gc,gd,ga,gb,nrow=4,heights=c(.1,.15,.15,.6)))
#dev.off()
```


```{r include=FALSE}
#on hold, creation of amino acid dataframe to plot abundance scores against
name <- c('Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Glu', 'Gln', 'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val')
quality <- c('Hydrophobic', 'Basic', 'Polar Neutral', 'Acidic', 'Polar Neutral', 'Acidic', 'Polar Neutral', 'Glycine', 'Basic', 'Hydrophobic', 'Hydrophobic', 'Basic', 'Hydrophobic', 'Hydrophobic', 'Hydrophobic', 'Polar Neutral', 'Polar Neutral', 'Hydrophobic', 'Hydrophobic', 'Hydrophobic')
#abundance <- get better scale
abundance <- c(0.0884, 0.057, 0.0417, 0.0539, 0.0124, 0.0624, 0.0382, 0.0703, 0.0220, 0.0595, 0.0994, 0.0527, 0.0237, 0.04, 0.0471, 0.0672, 0.0543, 0.0121, 0.03, 0.0677)
#isoelectric point <- unknown source (ncbi)
isoelectric <- c(6, 10.8, 5.4, 3, 5, 3.2, 5.7, 6, 7.6, 6, 6, 9.7, 5.7, 5.5, 6.3, 5.7, 5.6, 5.9, 5.7, 6.0)
hp_k_d <- c(1.8, -4.5, -3.5, -3.5, 2.5, -3.5, -3.5, -0.4, -3.2, 4.5, 3.8, -3.9, 1.9, 2.8, -1.6, -0.8, -0.7, -0.9, -1.3, 4.2)
hp_janin <-c(0.3, -1.4, -0.5, -0.6, 0.9, -0.7, -0.7, 0.3, -0.1, 0.7, 0.5, -1.8, 0.4, 0.5, -0.3, -0.1, -0.2, 0.3, -0.4, 0.6)
#Monera et al., J. Protein Sci (pro (-46) may be sketch)
hp_ph7 <- c(41, -14, -28, -55, 49, -31, -10, 0, 8, 99, 97, -23, 74, 100, -46, -5, 13, 97, 63, 76)
h_bonds <- c(0, 7, 5, 4, 0, 4, 5, 0, 3, 0, 0, 3, 0, 0, 0, 3, 3, 1, 3, 0)
mol_weight <-c(71, 156, 114, 115, 103, 129, 128, 57, 137, 113, 113, 128, 131, 147, 97, 87, 101, 186, 163, 99)

amino_acids.data <- data.frame(name, quality, abundance, isoelectric, hp_k_d, hp_janin, hp_ph7, h_bonds, mol_weight)

```

```{r}
# Multiplex paper's Fig2b (PTEN) shows trailing tails for the nonsense and synonymous variant scores
# this is to identify and investigate why variants in this tail are so far from their means

pten1_nonsense <- subset(pten1_proc, class == "nonsense")
tpmt1_nonsense <- subset(tpmt1_proc, class == "nonsense")
pten1_synon <- subset(pten1_proc, class == "synonymous")
tpmt1_synon <- subset(tpmt1_proc, class == "synonymous")

pten1_no_missense <- subset(pten1_proc, class == "synonymous" | class == "nonsense")

ggplot(pten1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") + theme_bw()
#+ geom_density()
ggplot(pten1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white") + theme_bw()

ggplot(pten1_proc_wt, aes(x=score)) + geom_histogram(data=subset(pten1_proc_wt,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "missense"), fill = "green", alpha = 0.2, binwidth=.01) + theme_bw()

ggplot(pten1_no_missense, aes(x=score)) + geom_histogram(data=subset(pten1_no_missense,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_no_missense,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + theme_bw()

# TPMT
#ggplot(tpmt1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") + theme_bw()
#ggplot(tpmt1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white") + theme_bw()


# we can see the paper used a different scale, which was ... deceiving
# not many variants are nonsense or synonymous, so the tail is not actually that large for both of them
```
```{r}
# Taking a subset of the data, all variants above a certain score (for nonsense), and below a certain score (for synonymous)

#0.55
nonsense_tail <- subset(pten1_nonsense, score > 0.6)
synon_tail <- subset(pten1_synon, score < 0.6)
nonsense_tail$secondary_struct <- ifelse(is.na(nonsense_tail$helix), "unknown",
                        ifelse(nonsense_tail$helix==1, "helix",
                        ifelse(nonsense_tail$sheet==1, "sheet",
                        ifelse(nonsense_tail$helix==0, "neither",
                        "unknown"))))
synon_tail$secondary_struct <- ifelse(is.na(synon_tail$helix), "unknown",
                        ifelse(synon_tail$helix==1, "helix",
                        ifelse(synon_tail$sheet==1, "sheet",
                        ifelse(synon_tail$helix==0, "neither",
                        "unknown"))))

n_tail <- nonsense_tail[,c(1,2,7,30,127)]
s_tail <- synon_tail[,c(1,2,7,30,127)]
n_tail$bp_pos <- (n_tail$position-1)*3
s_tail$bp_pos <- (s_tail$position-1)*3

n_tail
s_tail
```
```{r}
# Graphing the positions of these variants in the protein
#just in case there is a discernible pattern
s_tail_pos <- ggplot(s_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN synonymous variant tail scores in relation to protein structure") + theme_bw()
plot(s_tail_pos)

n_tail_pos <- ggplot(n_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN nonsense variant tail scores in relation to protein structure") + theme_bw()
plot(n_tail_pos)

# synonymous are scattered throughout protein
# makes sense that nonsense mutations occuring at tail end of protein have higher abundance
```

```{r include=FALSE}
# originally to see if low abundance synonymous variants had anything to do with splice sites, but once again, the way the library of proteins was constructed eliminates splice sites as a possibility
s_tail$prob_AG_GT <- c(0, 1/6, 1/2, 0, 1/2, 1/6)
s_tail$prob_titv <- c(0, 2/3, 2/3, 0, 2/3, 1/3)
ggplot(n_tail, aes(x=position,y=score)) + geom_point() + geom_smooth(method = "lm")
ggplot(s_tail, aes(x=prob_titv,y=score)) + geom_point() + geom_smooth(method = "lm")
ggplot(s_tail, aes(y=prob_titv,x=score)) + geom_point() + geom_smooth(method = "lm")
rsq <- function (x, y) cor(x, y)^2
n_rsq <- rsq(n_tail$position, s_tail$score)
s_rsq <- rsq(s_tail$prob_titv, s_tail$score)
n_rsq
s_rsq
#no relationship...
```

```{r include=FALSE}
#local
#getting dbNSFP's TPMT information from ruddle (downloaded from ruddle onto google sheets)

# on ruddle, `/ycga-gpfs/project/ysm/lek/ml2529/reference/dbNSFPv2.9.3.gz`
# `module load HTSlib/1.4.1-foss-2016a`
# `module load tabix/0.2.6-foss-2016b`
# extract all of the variants in TPMT with `tabix /ycga-gpfs/project/ysm/lek/ml2529/reference/dbNSFPv2.9.3.gz 6:18128542-18155305`

gs_ls()
tpmt_ruddle <- gs_title("TPMT_ruddle")
tpmt_read <- gs_read(ss=tpmt_ruddle, ws = "ruddle_tpmt_variants")
tpmt_ruddle_data <- as.data.frame(tpmt_read)
```
```{r echo=FALSE}
#data in dbNSFP is ordered in the reverse direction
#reversing data to fit our tpmt1_data
rever <- function(df=tpmt_ruddle_data){df<-df[dim(df)[1]:1,]}
tpmt_ruddle_data_rev = rever(tpmt_ruddle_data)

#creating variant column, equiv to tpmt1_data's
tpmt_ruddle_data_rev$variant <- do.call(paste, c(tpmt_ruddle_data_rev[c(5,24,6)], sep=""))

#making both tables smaller (getting rid of columns we don't need for easier viewing in RStudio)
tpmt_essential <- tpmt_ruddle_data_rev[,c(2,3,4,5,6,17,19,24,27,28,29,30,31,32,33,34,35,76,77,78,137)]
tpmt1_proc_ess <- tpmt1_proc_wt[,c(1,2,3,5,6,7,30,32,80)]

#merging tables with variant name
tpmt_merge <- merge(tpmt1_proc_ess, tpmt_essential, by="variant")

#comparing abundance scores with various scores in dbNSFP (contains annotations of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome)
tpmt_cor1 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(SIFT_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("SIFT score")+ggtitle("1") + theme_bw()
tpmt_cor1.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(SIFT_converted_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("SIFT converted rankscore")+ggtitle("1.5") + theme_bw()
tpmt_cor5 <- ggplot(tpmt_merge, aes(x=score, y=CADD_raw_rankscore))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("CADD raw rankscore")+ggtitle("5") + theme_bw()
tpmt_cor2 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HDIV_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HDIV score")+ggtitle("2") + theme_bw()
tpmt_cor3 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HVAR_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HVAR score")+ggtitle("3") + theme_bw()
tpmt_cor2.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HDIV_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HDIV rankscore")+ggtitle("2.5") + theme_bw()
tpmt_cor3.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HVAR_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HVAR rankscore")+ggtitle("3.5") + theme_bw()

#CADD_phred not worth

#plot(tpmt_cor5)
#plot(tpmt_cor1)
#plot(tpmt_cor1.5)
plot(tpmt_cor2)
plot(tpmt_cor3)
plot(tpmt_cor2.5)
plot(tpmt_cor3.5)
```
```{r}
#dbNSFP's various scores plotted against abundance score

TPMT_abun_CADD <- ggplot(tpmt_merge, aes(x=abundance_class, y=CADD_raw_rankscore)) + geom_violin(draw_quantiles = c( 0.5))+ylab("CADD raw rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_CADD)

TPMT_abun_SIFT_conv <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(SIFT_converted_rankscore))) + geom_violin(draw_quantiles = c(0.5))+ylab("SIFT conv rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_SIFT_conv)

TPMT_abun_POLY <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HDIV_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HDIV rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_POLY)

TPMT_abun_POLY1 <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HVAR_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HVAR rankscore")+xlab("Abundance Class") + theme_bw()
plot(TPMT_abun_POLY1)
```
```{r}
# Bar plots of what Polyphen and SIFT scores make up each abundance class!
Pred_abun_SIFT <- ggplot(tpmt_merge, aes(abundance_class)) + geom_bar(aes(fill = SIFT_pred)) + ggtitle("Abundance class vs SIFT prediction of Damaging or Tolerated") + theme_bw()
plot(Pred_abun_SIFT)

trial_sep <- tpmt_merge[c(21,23,24,26)]
tpmt_merge_expand <- separate_rows(tpmt_merge, c("Polyphen2_HDIV_score", "Polyphen2_HDIV_pred", "Polyphen2_HVAR_score", "Polyphen2_HVAR_pred"))

Pred_abun_HVAR <- ggplot(tpmt_merge_expand, aes(abundance_class)) + geom_bar(aes(fill = Polyphen2_HVAR_pred)) + ggtitle("Abundance class vs Polyphen2 HVAR predictions") + labs(subtitle = "D: Probably Damaging, P: Possibly Damaging, B: Benign") + theme_bw()
plot(Pred_abun_HVAR)
```

```{r include=FALSE}
# For PyMOL, creation of array to use to color PTEN and TPMT

#creation of b-score text files for pymol use
pten_pymol <- summarySE(pten1_data, measurevar="score", groupvars="position", na.rm=TRUE)
#score[404] is wt
write.table(pten_pymol$score[1:403], "pten_mean_scores_pymol.txt", sep="\n", row.names=F, col.names=F, na = "NaN")

tpmt_pymol <- summarySE(tpmt1_data, measurevar="score", groupvars="position", na.rm=TRUE)
#score[404] is wt
write.table(tpmt_pymol$score[1:245], "tpmt_mean_scores_pymol.txt", sep="\n", row.names=F, col.names=F, na = "NaN")
```

```{r}
# Focusing on individual "end", or variant, amino acids! A lot more can be done... and a lot of what I have here is unfocused and too busy to be helpful

twenty_color1 = c("#D02028", "#A4C33B","#53958B", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black', "#EDD941", "#F2F08E", "#EEC898", "#E1A12F", "#76C158",  "#BCDDAE", "#85782E", "#315935", "#A1DAE0", "#486EB6")

# Just to use as a reference track
pten_dssp_schematic1 <- ggplot() +
  geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab("Position in PTEN") + ylab("\n \n \n") +
  theme(panel.border = element_blank(), axis.text.y = element_blank())

# different amino acid lists, can be used to swap in and out of the below graphs if you don't want to plot all amino acids
aas <- c("A", "C", "P", "X")
aas1 <- c("S", "C", "P", "X")
aas2 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")

# Scatter plot of all Alanine variants (colored by reference amino acid)
pten_a_pos <- ggplot(data=subset(pten1_proc_wt, end=="A"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="A" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Alanine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
#plot(pten_a_pos)

# Repeated for Serine, Cysteine, Nonsense, etc
pten_s_pos <- ggplot(data=subset(pten1_proc_wt, end=="S"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="S" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Serine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)

pten_c_pos <- ggplot(data=subset(pten1_proc_wt, end=="C"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="C" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Cysteine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)

pten_x_pos <- ggplot(data=subset(pten1_proc_wt, end=="X"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="X" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of nonsense variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_a_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_s_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_c_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_x_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
```
```{r}
# More of above
#snake at the end (arches over grey region)
pten_p_pos <- ggplot(data=subset(pten1_proc_wt, end=="P"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="P" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Proline variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_p_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
```
```{r}
# More of above
#matching snake at the end!
pten_g_pos <- ggplot(data=subset(pten1_proc_wt, end=="G"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="G" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Glycine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_g_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

pten_h_pos <- ggplot(data=subset(pten1_proc_wt, end=="H"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="H" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Histidine variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_h_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))

# notice 2nd to last light grey region on reference track, all variants are high abundance there
pten_w_pos <- ggplot(data=subset(pten1_proc_wt, end=="W"), aes(x=position, y=score, colour=start))+ geom_point(data=subset(pten1_proc_wt, end=="W" & start %in% aas2), size=.6) + scale_x_continuous(breaks=seq(0, 403, 20), minor_breaks = seq(0, 403, 5), limits = c(0, 403), expand = c(0,0)) + scale_y_continuous(expand = c(0,0), limits = c(-0.25, 1.5)) + scale_color_manual(values=twenty_color1) +ylab("Abundance score")+labs(colour="Reference amino acid")+xlab(NULL)+ggtitle("Abundance scores of Typtophan variants") + theme(axis.text.x = element_blank(), legend.position='top') + geom_hline(yintercept=1, color="black", size=.1)
grid.newpage()
grid.draw(rbind(ggplotGrob(pten_w_pos), ggplotGrob(pten_dssp_schematic1), size = "last"))
```



